{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys,random\n",
    "import joblib\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split \n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn import metrics\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def timer (func):\n",
    "    '''\n",
    "    描述：\n",
    "        装饰器函数：计时器\n",
    "    '''\n",
    "    def wrapper(*args,**kwargs): \n",
    "        start = time.time()\n",
    "        result = func(*args,**kwargs)\n",
    "        end = time.time()\n",
    "        print(func.__name__+'运行时间：',end-start)\n",
    "        return result\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.加载有标签的训练集（前33465行）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Load the preprocessed feature tables and the labels. Both feature tables are\n",
    "# cut to their first 33465 rows, matching the 33465 in the label file name.\n",
    "input_dir = '../../preprocess_data_new/'\n",
    "\n",
    "data_date = joblib.load(input_dir + 'train_ax_date.lz4')\n",
    "data_date = data_date[:33465]\n",
    "\n",
    "# id and loan_dt are removed from this table before it is used as features\n",
    "data_nodup = joblib.load(input_dir + 'train_ax_nodup.lz4')\n",
    "data_nodup = data_nodup.drop(columns=['id','loan_dt'])[:33465]\n",
    "\n",
    "data_label = joblib.load(input_dir + 'train_y_33465.lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature concatenation.\n",
    "# NOTE: pd.concat([data_date,data_nodup],axis=1,ignore_index=True, copy=False) was\n",
    "# tried here but ran far too slowly (cause unknown), so the two tables are\n",
    "# stacked column-wise as numpy arrays instead.\n",
    "data_nodup = data_nodup.fillna(-1)  # missing values are replaced by -1\n",
    "x = np.hstack((data_date.values, data_nodup.values))\n",
    "\n",
    "# Label vector taken from the 'label' column of the label table\n",
    "y = data_label['label'].values\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.划分本地的训练集和测试集，测试各种学习器在本数据集上的性能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SelectModel(model_name):\n",
    "    '''\n",
    "    描述：\n",
    "        选择模型，返回初始化的模型对象（都是sklearn接口的）\n",
    "    '''\n",
    "    if model_name == 'GBC':\n",
    "        from sklearn.ensemble import GradientBoostingClassifier\n",
    "        model = GradientBoostingClassifier(loss='deviance',\n",
    "                                           learning_rate =0.1,\n",
    "                                           n_estimators=200,\n",
    "                                           subsample=0.9,\n",
    "                                           max_depth=3,\n",
    "                                          random_state=2018)\n",
    "    elif model_name == 'XGB':\n",
    "        from xgboost import XGBClassifier\n",
    "\n",
    "        model = XGBClassifier(max_depth=5,\n",
    "                              learning_rate =0.05, \n",
    "                              booster='gbtree',\n",
    "                              objective='binary:logistic',\n",
    "                              early_stopping_rounds=100,\n",
    "                              scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                              eval_metric='auc',\n",
    "                              seed=2018,\n",
    "                              n_jobs=8,\n",
    "                              num_boost_round = 200\n",
    "                             )\n",
    "    elif model_name == 'RFC':\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        model = RandomForestClassifier(n_estimators=1000,\n",
    "                                       n_jobs =8,\n",
    "                                       max_features='sqrt',\n",
    "                                       class_weight='balanced',\n",
    "                                       random_state=2018)\n",
    "    elif model_name == 'LGB':\n",
    "        # 参数修改记录：\n",
    "        # 对比1：max_depth: -1 ->5\n",
    "        # 对比3：max_depth：5->8,min_data_in_leaf:60->20\n",
    "        # 对比4：max_depth：8->4,min_data_in_leaf:20->40, num_boost_round:200->300, num_leaves = 135->100，num_threads：8->24\n",
    "        # 对比5：max_depth：4->3,min_data_in_leaf:40->100, num_boost_round:300->500, num_leaves = 100->200，num_threads：8->24，max_bin：200->250\n",
    "        # 对比5：max_depth：4->6,min_data_in_leaf:100->150, num_boost_round:500->350, num_leaves = 200->300，num_threads：8->24，max_bin：200->250\n",
    "        from lightgbm import LGBMClassifier\n",
    "        model = LGBMClassifier(boost='gbdt',\n",
    "                    num_leaves=300, \n",
    "                    scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                    max_depth=6,\n",
    "                    learning_rate=.05,\n",
    "                    max_bin=250,\n",
    "                    min_data_in_leaf= 150,\n",
    "                    objective='binary',\n",
    "                    metric='auc',\n",
    "                    num_threads=24,\n",
    "                    slient=False,\n",
    "                    num_boost_round = 350)\n",
    "    else:\n",
    "        pass\n",
    "    return model\n",
    "\n",
    "@timer\n",
    "def train_model(x_train, x_test, y_train, y_test, model_name):\n",
    "    '''\n",
    "    Fit one model, save it to disk and print its AUC on the test split.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    x_train, y_train :\n",
    "        Features and labels the model is fitted on.\n",
    "    x_test, y_test :\n",
    "        Features and labels used to compute the printed AUC.\n",
    "    model_name : str\n",
    "        Key passed to SelectModel; also used in the saved file name and in\n",
    "        the printed line.\n",
    "\n",
    "    Side effects\n",
    "    ------------\n",
    "    Creates ./comparation_model if needed and writes the fitted model to\n",
    "    ./comparation_model/<model_name>_model with joblib. The run time is\n",
    "    printed by the @timer decorator.\n",
    "    '''\n",
    "    model = SelectModel(model_name)\n",
    "    model.fit(x_train, y_train)\n",
    "\n",
    "    # exist_ok avoids the check-then-create race of exists() followed by mkdir()\n",
    "    os.makedirs('./comparation_model', exist_ok=True)\n",
    "    joblib.dump(model, './comparation_model/%s_model'%model_name)\n",
    "\n",
    "    # AUC measures how well the scores rank positives above negatives, so it\n",
    "    # needs the predicted probability of the positive class. Hard 0/1 labels\n",
    "    # from predict() collapse the ROC curve to a single threshold and\n",
    "    # understate the AUC.\n",
    "    # Assumes binary labels with the positive class in column 1 of predict_proba.\n",
    "    pred_test = model.predict_proba(x_test)[:, 1]\n",
    "    auc = metrics.roc_auc_score(y_test, pred_test)\n",
    "    print('test-auc(%s):'%model_name,auc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比0（学习器性能对比）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Split the data: 30% is held out as the local test set, with a fixed random_state\n",
    "x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(RFC): 0.5082499734460394\n",
      "train_model运行时间： 80.6416265964508\n",
      "test-auc(XGB): 0.7180516756363665\n",
      "train_model运行时间： 62.886208057403564\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.5549797310197829\n",
      "train_model运行时间： 125.06994533538818\n",
      "test-auc(GBC): 0.5287540295227912\n",
      "train_model运行时间： 1509.906184911728\n"
     ]
    }
   ],
   "source": [
    "# 2. Train every candidate model on the same split; train_model prints the test AUC and the run time\n",
    "model_names = ['RFC','XGB','LGB','GBC']\n",
    "for model_name in model_names:\n",
    "    train_model(x_train, x_test, y_train, y_test, model_name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比1：\n",
    "把lightgbm的参数max_depth由-1改为5，发现auc由0.55提升至0.70，运行时间由125s缩短至32s，性能提升！"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.7035308948000503\n",
      "train_model运行时间： 31.934845447540283\n"
     ]
    }
   ],
   "source": [
    "# Re-run LightGBM only, on the existing train/test split\n",
    "train_model(x_train, x_test, y_train, y_test, 'LGB')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比2：\n",
    "更换特征data_nodup 为 data_raw  \n",
    "auc由0.7035变为0.6998，看起来data_nodup效果更佳，但是也有可能是因为lightgbm设置的参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.6998177388276248\n",
      "train_model运行时间： 36.55933976173401\n"
     ]
    }
   ],
   "source": [
    "# Build the feature matrix from data_raw (train_ax) instead of data_nodup,\n",
    "# then repeat the LightGBM run on a split created with the same random_state.\n",
    "data_raw = joblib.load(input_dir + 'train_ax.lz4')\n",
    "data_raw = data_raw.drop(columns=['id','loan_dt'])[:33465]\n",
    "data_raw = data_raw.fillna(-1)  # missing values are replaced by -1\n",
    "\n",
    "x = np.hstack((data_date.values, data_raw.values))\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3096)\n",
    "train_model(x_train, x_test, y_train, y_test, 'LGB')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.使用data_raw\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.6118573371362515\n",
      "train_model运行时间： 75.4497618675232\n",
      "2.使用data_nodup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.6001546971838466\n",
      "train_model运行时间： 68.99368691444397\n"
     ]
    }
   ],
   "source": [
    "# LightGBM with a new parameter combination, run on both feature variants.\n",
    "# The split is re-created with the same random_state for each variant.\n",
    "for banner, features in (('1.使用data_raw', data_raw), ('2.使用data_nodup', data_nodup)):\n",
    "    x = np.hstack((data_date.values, features.values))\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3096)\n",
    "    print(banner)\n",
    "    train_model(x_train, x_test, y_train, y_test, 'LGB')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.使用data_raw\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.7206630866063126\n",
      "train_model运行时间： 24.63110637664795\n",
      "2.使用data_nodup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.7137290818324644\n",
      "train_model运行时间： 23.690422773361206\n"
     ]
    }
   ],
   "source": [
    "# LightGBM with a new parameter combination, run on both feature variants.\n",
    "# The split is re-created with the same random_state for each variant.\n",
    "for banner, features in (('1.使用data_raw', data_raw), ('2.使用data_nodup', data_nodup)):\n",
    "    x = np.hstack((data_date.values, features.values))\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3096)\n",
    "    print(banner)\n",
    "    train_model(x_train, x_test, y_train, y_test, 'LGB')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.使用data_raw\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.7325156016736488\n",
      "train_model运行时间： 29.611547231674194\n",
      "2.使用data_nodup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.7390925126953997\n",
      "train_model运行时间： 26.568469762802124\n"
     ]
    }
   ],
   "source": [
    "# LightGBM with a new parameter combination, run on both feature variants.\n",
    "# The split is re-created with the same random_state for each variant.\n",
    "for banner, features in (('1.使用data_raw', data_raw), ('2.使用data_nodup', data_nodup)):\n",
    "    x = np.hstack((data_date.values, features.values))\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3096)\n",
    "    print(banner)\n",
    "    train_model(x_train, x_test, y_train, y_test, 'LGB')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.使用data_raw\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.669891099438229\n",
      "train_model运行时间： 42.93098282814026\n",
      "2.使用data_nodup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test-auc(LGB): 0.6643234018262608\n",
      "train_model运行时间： 39.20240902900696\n"
     ]
    }
   ],
   "source": [
    "# LightGBM with a new parameter combination, run on both feature variants.\n",
    "# The split is re-created with the same random_state for each variant.\n",
    "for banner, features in (('1.使用data_raw', data_raw), ('2.使用data_nodup', data_nodup)):\n",
    "    x = np.hstack((data_date.values, features.values))\n",
    "    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3096)\n",
    "    print(banner)\n",
    "    train_model(x_train, x_test, y_train, y_test, 'LGB')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对比总结\n",
    "1. 各个学习器的性能（对比0）  \n",
    "test-auc(RFC): 0.5082499734460394, train_model运行时间： 80.6416265964508  \n",
    "test-auc(XGB): 0.7180516756363665, train_model运行时间： 62.886208057403564  \n",
    "test-auc(LGB): 0.5549797310197829, train_model运行时间： 125.06994533538818  \n",
    "test-auc(GBC): 0.5287540295227912, train_model运行时间： 1509.906184911728\n",
    "2. data_nodup降维的效果验证  \n",
    "通过对比2、3、4、5、6，我们发现：  \n",
    "降维之后的数据集data_nodup和降维之前的数据集data_raw在test-auc分数基本相同，  \n",
    "可见data_nodup在降低维度的同时又保留data_raw绝大部分的信息，这是非常好的！"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.历史的一些记录，已经没有代码了"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  4.1 (代码整理前的记录)测试本地集auc分数\n",
    "保留原始数据的记录：  \n",
    "rfc AUC: 0.805099223842394  \n",
    "gbc AUC: 0.8293387542510773   \n",
    "xgb AUC: 0.8313812914152184  \n",
    "lgb AUC: 0.8430383986560764\n",
    "\n",
    " \n",
    "linear-svm AUC: 0.5075515879920058  线性不可分    \n",
    "logistic regression AUC: 0.551876338694301 不适用于此类数据  \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.2 (代码整理前的记录)线上valid-auc分数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1.xgboost：\n",
    "raw 去除nan列 + 统计null AUC: 0.82751113123698  \n",
    "nodup + null + tag AUC: 0.82803008109167  \n",
    "nodup + null + tag（rank融合）AUC:0.8279914450872  \n",
    "nodup + null + tag (fillna(-1)) AUC:0.82979480823375  \n",
    "\n",
    "2.GradientBoostingClassifier AUC:0.81958272831703  \n",
    "\n",
    "3.RFC AUC:0.78512734928805   \n",
    "\n",
    "4.20个lgb AUC:0.8079  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
